import os
import time

import pandas as pd

# Data cleaning
def data_cleaning(file_in_path, file_out_path, chunksize=1000000):
    """Filter a large CSV chunk by chunk and append the kept rows to another CSV.

    The input must contain the columns ``latitude``, ``longitude``, ``state``
    and ``speed``. For every chunk the function:

    1. drops rows containing any missing value,
    2. drops rows that are exact duplicates *within that chunk* (duplicates
       that fall into different chunks are not detected),
    3. keeps rows with 35.21 <= latitude <= 37.31 and
       107.41 <= longitude <= 110.31,
    4. keeps rows with ``state == 3`` and ``speed == 0``.

    Rows are appended to ``file_out_path``; an existing file is never
    truncated. The header line is written only when the output file does not
    exist yet or is empty, so calling the function again with the same output
    path does not inject a second header line into the data.

    Args:
        file_in_path: Path of the CSV file to read.
        file_out_path: Path of the CSV file to append the cleaned rows to.
        chunksize: Number of input rows processed per chunk.

    Returns:
        None. The result is written to ``file_out_path``.
    """
    # Decide once, up front, whether the output still needs a header line.
    write_header = (
        not os.path.exists(file_out_path)
        or os.path.getsize(file_out_path) == 0
    )

    reader = pd.read_csv(file_in_path, chunksize=chunksize)
    try:
        for df in reader:
            # Remove rows with missing values, then duplicate rows.
            df = df.dropna().drop_duplicates()

            # Geographic bounding box (inclusive on both ends).
            in_bounds = (
                (df.latitude >= 35.21) & (df.latitude <= 37.31)
                & (df.longitude >= 107.41) & (df.longitude <= 110.31)
            )
            # NOTE(review): the original comment said this step removes
            # empty-vehicle records; the code keeps only state == 3, so
            # presumably 3 is the non-empty state -- confirm.
            has_state = df.state == 3
            # NOTE(review): the original comment only said "handle speed";
            # the code keeps only rows whose speed is exactly 0 -- confirm
            # this is the intended filter.
            is_stopped = df.speed == 0

            df = df[in_bounds & has_state & is_stopped]

            df.to_csv(file_out_path, mode="a", index=False, header=write_header)
            # Every later chunk is a continuation of the same table.
            write_header = False
    finally:
        # Release the input file handle even if a chunk fails to process.
        reader.close()

if __name__ == '__main__':
    # Script entry point: clean one hard-coded input file and report how long
    # the run took.
    # perf_counter is a monotonic clock, so the measured duration is not
    # affected by wall-clock adjustments during the run.
    start = time.perf_counter()
    # os.path.join picks the separator of the host OS; on Windows this yields
    # the same backslash-separated relative paths as before.
    filein = os.path.join('..', 'data', 'data_original', 'aH1806.csv')
    fileout = os.path.join('..', 'data', 'clean_file', 'clean1806.csv')
    data_cleaning(filein, fileout)
    end = time.perf_counter()
    print("Runtime: " + str(end - start))
